Import data

For more data field references: https://caaspp-elpac.ets.org/caaspp/ResearchFileFormatSB?ps=true&lstTestYear=2022&lstTestType=B

Wrangling

TODO:



Decisions:

  • keep charter schools for now
  • drop “declined to state” and keep only one variable for parental education
  • use “Currently EL/or not” as the one variable for ELLs
# Columns retained from the raw CAASPP research file before pivoting.
select_cols <- c(
  "School.Code",
  "Student.Group.ID",
  "Students.Tested"
)

# Demographic (Student.Group) IDs kept for the analysis:
#   001 All Students
#   128 Reported disabilities       (Disability Status)
#   031 Economically disadvantaged  (Economic Status)
#   160 EL / English learner        (English-Language Fluency)
#   Race and Ethnicity:
#     075 American Indian or Alaska Native, 076 Asian,
#     074 Black or African American, 077 Filipino,
#     078 Hispanic or Latino, 079 Native Hawaiian or Pacific Islander,
#     080 White, 144 Two or more races
#   004 Female                      (Gender)
#   090 Not a high school graduate  (Parent Education)
select_groups <- c(
  1, 128, 31, 160,
  75, 76, 74, 77, 78, 79, 80, 144,
  4, 90
)

# School-by-demographic table for ELA, all grades combined: one row per
# school after pivoting, one column per student group holding that group's
# Students.Tested count.
caaspp22_ela_pivoted <- caaspp22 |>
  # Grade 13 aggregates all grades; Test.ID 1 is ELA
  filter(Grade == 13, Test.ID == 1) |>
  # regular public schools (7) plus direct (9) / locally (10) funded charters
  filter(Type.ID %in% c(7, 9, 10)) |>
  select(all_of(select_cols)) |>
  filter(Student.Group.ID %in% select_groups) |>
  # attach the human-readable student-group names
  left_join(y = student_groups, by = c("Student.Group.ID" = "Demographic ID")) |>
  rename("Demographic.ID.Num" = "Demographic ID Num",
         "Demographic.Name" = "Demographic Name") |>
  select(all_of(c("School.Code", "Students.Tested", "Demographic.Name"))) |>
  # pivot from long form to wide: each student group becomes a column
  pivot_wider(names_from = Demographic.Name, values_from = "Students.Tested")

After we pivot the original dataset, we have 10,257 rows (public schools plus direct- and locally-funded charter schools) and 16 variables (school code, school size, and 14 student-group sizes). For the school size, as well as for the size of each student group, the value in the cell is simply the count of students tested.

Select Sample

Grade Spans

# Per-row grade summary (left grouped): attaches, to every remaining row of
# caaspp22, the number of distinct tested grades at its school and the
# comma-separated list of those grades.
# NOTE(review): this object is never used below — `grade_count` and
# `grade_list` recompute the same summaries; consider deleting this chunk.
caaspp22_grade <- caaspp22 |> 
  group_by(School.Code) |> # 10,258 Unique School Codes
  # drop the district-level aggregate code (0) and the all-grades row (13)
  filter(School.Code != 0 & Grade != 13) |>
  mutate(Grade.Count = n_distinct(Grade)) |>
  mutate(Grade.List = paste0(unique(Grade), collapse = ','))

# Per-school grade summary for real schools (School.Code != 0), excluding
# the all-grades aggregate row (Grade 13): number of distinct tested grades
# and the comma-separated list of those grades.
# Computed in a single pass; the original ran two identical pipelines
# (grade_count / grade_list) and joined them back together.
grades <- caaspp22 |>
  filter(School.Code != 0 & Grade != 13) |>
  group_by(School.Code) |> # 10,258 Unique School Codes
  summarise(Grade.Count = n_distinct(Grade),
            Grade.List = paste0(unique(Grade), collapse = ','))

# Kept for backward compatibility with the original two-object layout.
grade_count <- grades |> select(School.Code, Grade.Count)
grade_list <- grades |> select(School.Code, Grade.List)

# Cross-tabulate grade lists against grade counts and keep only the
# combinations that actually occur in the data.
school_type_table <- as.data.frame(
  table(grades$Grade.List, grades$Grade.Count)
) |>
  setNames(c('grade_list', 'grade_count', 'freq')) |>
  filter(freq > 0)

# write.csv(school_type_table,file='data/school_type.csv')

# Grade spans treated as "typical"; schools with any other span are dropped
# as non-traditional (see the discussion in the surrounding text).
typical_span <- c(
  "3,4,5",
  "3,4,5,6",
  "3,4,5,6,7,8",
  "3,4,5,6,7,8,11",
  "6,7,8",
  "6,7,8,11",
  "7,8",
  "7,8,11"
)

# Schools whose grade span is not one of the typical combinations.
grade_atypical_span <- filter(grades, !(Grade.List %in% typical_span))

# Their school codes, used as an exclusion list further down.
is_grade_atypical_span <- grade_atypical_span[["School.Code"]]

In our attempt to tackle the inconsistency in the combinations of grade levels, we eliminate the non-traditional schools because they are probably not representative of schools as a whole and are likely going through some type of transition (opening/closing/expanding). We consulted Batson from Ravenswood District, and decided to include schools with the following grade spans:

  • 3,4,5
  • 3,4,5,6
  • 3,4,5,6,7,8
  • 3,4,5,6,7,8,11
  • 6,7,8
  • 6,7,8,11
  • 7,8
  • 7,8,11

This includes 74% of the data (7629/10258 schools) and would give more confidence that schools are more directly comparable.

Outcome inspection

# Per-school count and list of grades that actually have a reported mean
# scale score (ELA, All Students). Schools absent from this table have no
# usable ELA outcome at any grade.
grade_non_missing <- caaspp22 |>
  # select 10258 schools (1 more since there is one school that only has math scores)
  filter(School.Code > 0) |>
  # select only ELA results
  filter(Test.ID==1) |>
  # select `All Students`
  filter(Student.Group.ID == 1) |>
  # drop the all-grades aggregate row
  filter(Grade != 13) |>
  # drop rows where `Mean.Scale.Score` is missing: '' or the suppressed '*'
  filter(Mean.Scale.Score != '' & Mean.Scale.Score != '*') |> 
  group_by(School.Code) |>
  summarise(Non.Missing.Grade.Count = n_distinct(Grade), 
            Non.Missing.Grade.List = paste0(unique(Grade), collapse = ','))

# Schools that have a grade span on record but no non-missing ELA score at
# any grade: they fall out of `grade_non_missing`, so after the right join
# their Non.Missing.Grade.Count is NA.
grade_all_missing <- grade_non_missing |>
  right_join(y = grades, by = "School.Code") |>
  filter(is.na(Non.Missing.Grade.Count))

# Multi-grade schools whose only scored grade is 11 (no grade 3-8 outcome).
grade_all_missing_but_11 <- grade_non_missing |>
  right_join(y = grades, by = "School.Code") |>
  filter(Non.Missing.Grade.List == '11' & Non.Missing.Grade.Count == 1 & Grade.Count > 1)

# School-code exclusion lists used when filtering the pivoted table below.
is_grade_all_missing <- grade_all_missing[["School.Code"]]
is_grade_all_missing_but_11 <- grade_all_missing_but_11[["School.Code"]]

Missing values

Noticeably, there are a lot of missing values. For some of the student subgroups, the share of missing values is higher than 50%. On average, there are more than two missing values per row. The specific distribution can be seen below.

Besides NA values, there are also a lot of * entries in the table. These two types of missing values are generated by two different mechanisms:

  • NA: comes from the pivoting process, i.e. there is not even a row for that specific student group for that specific school
  • *: comes from the original dataset (the original dataset has no NA values). It is masked for privacy reasons, and theoretically indicates small values.
# NA audit. In the raw file only the (unused) Filler column contains NAs —
# exactly one per row, per the tables below. In the pivoted table an NA
# means the school had no row at all for that student group.
# NOTE(review): summarise_all() is superseded in current dplyr; the modern
# spelling is summarise(across(everything(), ~ sum(is.na(.x)))).
# NA for original data
caaspp22 %>% summarise_all(~ sum(is.na(.))) # NA in each column
##   County.Code District.Code School.Code  Filler Test.Year Student.Group.ID
## 1           0             0           0 3855781         0                0
##   Test.Type Total.Tested.at.Reporting.Level
## 1         0                               0
##   Total.Tested.with.Scores.at.Reporting.Level Grade Test.ID Students.Enrolled
## 1                                           0     0       0                 0
##   Students.Tested Mean.Scale.Score Percentage.Standard.Exceeded
## 1               0                0                            0
##   Percentage.Standard.Met Percentage.Standard.Met.and.Above
## 1                       0                                 0
##   Percentage.Standard.Nearly.Met Percentage.Standard.Not.Met
## 1                              0                           0
##   Students.with.Scores Area.1.Percentage.Above.Standard
## 1                    0                                0
##   Area.1.Percentage.Near.Standard Area.1.Percentage.Below.Standard
## 1                               0                                0
##   Area.2.Percentage.Above.Standard Area.2.Percentage.Near.Standard
## 1                                0                               0
##   Area.2.Percentage.Below.Standard Area.3.Percentage.Above.Standard
## 1                                0                                0
##   Area.3.Percentage.Near.Standard Area.3.Percentage.Below.Standard
## 1                               0                                0
##   Area.4.Percentage.Above.Standard Area.4.Percentage.Near.Standard
## 1                                0                               0
##   Area.4.Percentage.Below.Standard Type.ID
## 1                                0       0
table(rowSums(is.na(caaspp22))) # NA in each row
## 
##       1 
## 3855781
# NA for pivoted 
caaspp22_ela_pivoted %>% summarise_all(~ sum(is.na(.))) # NA in each column
## # A tibble: 1 × 15
##   School.Code All S…¹ Female Econo…² Black…³ Ameri…⁴ Asian Hispa…⁵ White Not a…⁶
##         <int>   <int>  <int>   <int>   <int>   <int> <int>   <int> <int>   <int>
## 1           0       0    271     152    1985    5515  2405     214   611     992
## # … with 5 more variables: `Reported disabilities` <int>,
## #   `Two or more races` <int>, `EL (English learner)` <int>, Filipino <int>,
## #   `Native Hawaiian or Pacific Islander` <int>, and abbreviated variable names
## #   ¹​`All Students`, ²​`Economically disadvantaged`,
## #   ³​`Black or African American`, ⁴​`American Indian or Alaska Native`,
## #   ⁵​`Hispanic or Latino`, ⁶​`Not a high school graduate`
table(rowSums(is.na(caaspp22_ela_pivoted))) # NA in each row
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12 
## 2046 2584 2007 1208  886  567  358  197  124  100  101   66   13
# Count the cells of `x` flagged with the privacy-suppression marker '*'.
# (Name spelling kept as-is: it is referenced throughout the file.)
count_asterick <- function(x) sum(x == '*', na.rm = TRUE)

# '*' audit: the raw file marks privacy-suppressed small cells with '*'.
# apply() coerces the data frame to a character matrix first, which is fine
# here because each cell is only tested for equality against '*'.
# * for original data
apply(caaspp22, 2, count_asterick) # * in each column
##                                 County.Code 
##                                           0 
##                               District.Code 
##                                           0 
##                                 School.Code 
##                                           0 
##                                      Filler 
##                                           0 
##                                   Test.Year 
##                                           0 
##                            Student.Group.ID 
##                                           0 
##                                   Test.Type 
##                                           0 
##             Total.Tested.at.Reporting.Level 
##                                      450332 
## Total.Tested.with.Scores.at.Reporting.Level 
##                                      450471 
##                                       Grade 
##                                           0 
##                                     Test.ID 
##                                           0 
##                           Students.Enrolled 
##                                      978755 
##                             Students.Tested 
##                                      942268 
##                            Mean.Scale.Score 
##                                     1308034 
##                Percentage.Standard.Exceeded 
##                                     1622471 
##                     Percentage.Standard.Met 
##                                     1622471 
##           Percentage.Standard.Met.and.Above 
##                                     1622471 
##              Percentage.Standard.Nearly.Met 
##                                     1622471 
##                 Percentage.Standard.Not.Met 
##                                     1622471 
##                        Students.with.Scores 
##                                      941724 
##            Area.1.Percentage.Above.Standard 
##                                     2316449 
##             Area.1.Percentage.Near.Standard 
##                                     2316449 
##            Area.1.Percentage.Below.Standard 
##                                     2316449 
##            Area.2.Percentage.Above.Standard 
##                                     2316449 
##             Area.2.Percentage.Near.Standard 
##                                     2316449 
##            Area.2.Percentage.Below.Standard 
##                                     2316449 
##            Area.3.Percentage.Above.Standard 
##                                     2316449 
##             Area.3.Percentage.Near.Standard 
##                                     2316449 
##            Area.3.Percentage.Below.Standard 
##                                     2316449 
##            Area.4.Percentage.Above.Standard 
##                                     2316449 
##             Area.4.Percentage.Near.Standard 
##                                     2316449 
##            Area.4.Percentage.Below.Standard 
##                                     2316449 
##                                     Type.ID 
##                                           0
table(apply(caaspp22, 1, count_asterick)) # * in each row
## 
##       0       1       2       3       4      12      17      18      19      20 
## 1453840   63003    1640   20574     275  693978  135172  545575     887   27025 
##      21      22      23 
##  498665  171487  243660
# * for pivoted
apply(caaspp22_ela_pivoted, 2, count_asterick) # * in each column
##                         School.Code                        All Students 
##                                   0                                 287 
##                              Female          Economically disadvantaged 
##                                 320                                 326 
##           Black or African American    American Indian or Alaska Native 
##                                2511                                3686 
##                               Asian                  Hispanic or Latino 
##                                2106                                 386 
##                               White          Not a high school graduate 
##                                1316                                1385 
##               Reported disabilities                   Two or more races 
##                                 676                                1947 
##                EL (English learner)                            Filipino 
##                                 861                                2697 
## Native Hawaiian or Pacific Islander 
##                                3212
table(apply(caaspp22_ela_pivoted, 1, count_asterick)) # * in each row
## 
##    0    1    2    3    4    5    6    7    8 
## 1119 2601 2945 1983 1022  402  131   40   14

We handle the missing values in two steps. For all * cells, we replace the value with n = 1 to show it is an arbitrarily small value. For all NA cells, we replace the value with 0, assuming the reason the row is missing is that the school does not have that type of student.

In addition to replacing missing values by imputing, we also process them by dropping observations.

  • we drop all 223 schools with 0 students tested, as well as the 287 schools with * in the All Students column
  • we drop those schools with missing Mean.Scale.Score data
  • we drop those schools with atypical grade scope
# Work on a data.frame copy; data.frame() also repairs the pivoted column
# names (spaces become dots, e.g. `All Students` -> All.Students).
caaspp22_ela_filled <- data.frame(caaspp22_ela_pivoted)

# Drop the schools we cannot use:
caaspp22_ela_filled <- caaspp22_ela_filled |>
  # too few students: zero tested, or the suppressed '*' marker overall
  filter(All.Students != 0, All.Students != '*') |>
  # no non-missing `Mean.Scale.Score` at any grade, or only at grade 11
  filter(!School.Code %in% is_grade_all_missing,
         !School.Code %in% is_grade_all_missing_but_11) |>
  # atypical grade span
  filter(!School.Code %in% is_grade_atypical_span)

# Impute the two kinds of missingness, then convert counts to percentages.
#   '*' (privacy-suppressed small cell)      -> '1', an arbitrarily small count
#   NA  (no row for that group at a school)  -> '0', i.e. no such students
caaspp22_ela_filled[caaspp22_ela_filled=='*'] <- '1'    # *s replaced by 1
caaspp22_ela_filled[is.na(caaspp22_ela_filled)] <- '0'  # NA replaced by 0

# The count columns are character at this point; make them numeric.
# (across() replaces the superseded mutate_if().)
caaspp22_ela_filled <- caaspp22_ela_filled |>
  mutate(across(where(is.character), as.numeric))

# Every column from position 3 on is a subgroup count (cols 1-2 are
# School.Code and All.Students). Derive Perc.<group> = 100 * count / school
# size in one vectorized step, replacing the column-by-column for loop.
col_names <- names(caaspp22_ela_filled)[3:length(names(caaspp22_ela_filled))]
caaspp22_ela_filled <- caaspp22_ela_filled |>
  mutate(across(all_of(col_names), ~ .x / All.Students * 100,
                .names = "Perc.{.col}"))

# Keep only School.Code, All.Students, and the percentage columns.
caaspp22_ela_demo <- caaspp22_ela_filled |> select(-all_of(col_names))

write.csv(caaspp22_ela_demo,file='data/caaspp22_ela_demo.csv')

Outcome Encoding

# Per-grade ELA outcomes (All Students) for grades 3-8, one row per
# school-grade.
caaspp_ela_outcome <- caaspp22 |>
  select(School.Code, Student.Group.ID, Test.ID, Grade, Students.Enrolled, Students.Tested, Mean.Scale.Score) |>
  # real schools only (one extra school has math scores but no ELA)
  filter(School.Code > 0) |>
  # ELA results (Test.ID 1) for `All Students` (group 1), tested grades 3-8
  filter(Test.ID == 1, Student.Group.ID == 1, Grade %in% 3:8) |>
  select(School.Code, Grade, Students.Enrolled, Students.Tested, Mean.Scale.Score)

# Lowest "Standard Met" ELA scale score for each grade (3-8). The outcome
# per school-grade is the distance of the mean scale score from this cutoff.
ela_standard_met <- c(`3` = 2432, `4` = 2473, `5` = 2502,
                      `6` = 2618, `7` = 2649, `8` = 2668)

# One vectorized pass replaces the six copy-pasted per-grade blocks
# (grade_3 .. grade_8). Scores of '' or '*' coerce to NA (with a warning)
# and are dropped by na.omit(), exactly as before.
# NOTE(review): row order now follows caaspp_ela_outcome instead of being
# grouped grade-by-grade; the per-school aggregation below is unaffected.
caaspp_ela_score_distance <- caaspp_ela_outcome |>
  mutate(distance_from_score_ela =
           as.numeric(Mean.Scale.Score) -
             unname(ela_standard_met[as.character(Grade)])) |>
  na.omit()

# Average the per-grade distances into a single per-school ELA outcome.
# Direct summarise() replaces the superseded summarise_at() + rename() pair.
caaspp_ela_aggregate <- caaspp_ela_score_distance |>
  group_by(School.Code) |>
  summarise(Avg.Score.Dist = mean(distance_from_score_ela))

Output

# output a file attaching score results to demo data
caaspp22_ela_score <- caaspp22_ela_demo |> left_join(y=caaspp_ela_aggregate, by = c("School.Code" = "School.Code"))

write.csv(caaspp22_ela_score, file='data/caaspp22_ela_score.csv')
# Math distances come from a parallel math script; only the key and the
# aggregate math distance column are needed here.
caaspp22_math_score <- read_csv('data/caaspp22_math_score.csv') |>
  select(c('School.Code','Avg.Score.Dist.Math'))
## New names:
## Rows: 7294 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (17): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# NOTE(review): the ELA file written above is immediately re-read; the round
# trip adds the `...1` row-number column that is dropped again below.
caaspp22_ela_score <- read_csv('data/caaspp22_ela_score.csv')
## New names:
## Rows: 7291 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (17): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only schools that have both ELA and math outcomes; select(-1) drops
# the `...1` index column.
caaspp22_score <- caaspp22_ela_score |>
  inner_join(y=caaspp22_math_score, by = c("School.Code" = "School.Code")) |>
  select(-1)

write.csv(caaspp22_score, file='data/caaspp22_score.csv')

Modeling

Pre-processing

Before running models, we would like to conduct the proper preprocessing.

# 7291 Schools with the non-trivial size, proper grade span, and non-missing grades
# Demographic feature matrix for the nearest-neighbour search, indexed by
# School.Code and standardized column-wise.
caaspp22_final <- read_csv('data/caaspp22_ela_demo.csv') |>
    # drop the column to set School.Code as index
    column_to_rownames(var = 'School.Code') |>
    select(-'...1') |> 
    # standardize all columns
    mutate(across(where(is.numeric), scale))
# NOTE(review): scale() returns a one-column matrix, so each standardized
# column is stored as a matrix inside the data frame; the nn2() call below
# ran fine on this, but wrapping scale in as.numeric() would be cleaner.
## New names:
## Rows: 7291 Columns: 16
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (16): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# no missing values
# caaspp22_ela[is.na(caaspp22_ela), ]

# Scores with demographics, used later to annotate the neighbour table.
caaspp22_score <- read_csv('data/caaspp22_score.csv')
## New names:
## Rows: 7288 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (18): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Since scaling a variable is a linear transformation, it does not change the shape of the variable's distribution, so a non-normal distribution poses no problem. We therefore do not need to worry about the outlier school, City of Angels at Los Angeles Unified, whose size is far greater than that of all other schools (n = 6421, versus 3763 for the second largest).

Nearest Neighbor

For NN, we use the nn2 function from RANN package, which finds the k nearest neighbours for every point in a given dataset in O(N log N) time using Arya and Mount’s ANN library (v1.1.3). The distance is computed using the L2 (Euclidean) metric. For alternative metric, one could see package ‘RANN.L1’ for the same functionality using the L1 (Manhattan, taxicab) metric.

Notice that the built-in knn function runs a k-nearest neighbour classification for test set from training set, which does not apply for our situation.

#NN search: https://search.r-project.org/CRAN/refmans/RANN/html/nn2.html

# k: The maximum number of nearest neighbours to compute. 
# to generate top 10 nearest neighbours we set k = 11 to offset oneself:
# in a self-query each point's nearest neighbour is itself at distance 0.
nearest <- nn2(caaspp22_final, caaspp22_final, k=11)
# Ravenswood (District.Code 68999) school codes, using the same school-type
# filter as the pivoted table: public (7), direct (9) / locally (10) funded
# charters.
ravenswood_school_codes <- caaspp22 %>% 
                                filter(District.Code=='68999') %>% 
                                filter(Type.ID==7 | Type.ID==9 | Type.ID==10) %>%
                                select(School.Code) %>% 
                                unique() %>% 
                                .$School.Code

# Collect, for every Ravenswood school, its own row (rank 0) plus its ten
# nearest neighbours in demographic space: one row per (center, neighbour).
# Built as a list of typed data.frames bound once at the end. This avoids
# both the O(n^2) row-by-row growth of the original loop and its implicit
# coercion of every column to character via c(), which then required a
# blanket numeric re-conversion downstream.
top10_ravenswood <- bind_rows(lapply(ravenswood_school_codes, function(school) {
  # position of this school in the scaled feature matrix
  row_idx <- which(rownames(caaspp22_final) == as.character(school))
  if (length(row_idx) != 1) {
    # school absent from the filtered sample: nothing to report
    # (the original loop emitted all-NA rows in this case)
    return(NULL)
  }
  data.frame(
    Rank = 0:10,  # 0 = the school itself (self-match from the nn2 self-query)
    Dist = nearest$nn.dists[row_idx, ],
    Center = school,
    School.Code = as.numeric(rownames(caaspp22_final)[nearest$nn.idx[row_idx, ]])
  )
}))

# Attach school names (entities) and outcome scores to each neighbour row.
# across(where(...)) replaces the superseded mutate_if(); it is a no-op when
# the columns are already numeric.
top10_ravenswood <- top10_ravenswood |> 
  mutate(across(where(is.character), as.numeric)) |>
  left_join(y=entities, by = c("School.Code" = "School Code")) |>
  left_join(y=caaspp22_score, by = c("School.Code" = "School.Code"))

write.csv(top10_ravenswood, file='data/top10_ravenswood.csv')

KNN regression

#KNN regression: https://www.datatechnotes.com/2020/10/knn-regresion-example-in-r.html

df <- readr::read_csv('data/caaspp22_score.csv') |>
    # drop the column to set School.Code as index
    tibble::column_to_rownames(var = 'School.Code') |>
    # scale every column
    dplyr::mutate(across(2:length(df), scale))
## New names:
## Rows: 7288 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (18): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# ravenswood results as test set 
# rownames(df) is evaluated on the full df before filtering, so this keeps
# exactly the Ravenswood schools present in the sample.
test <- df |>
  dplyr::filter(rownames(df) %in% ravenswood_school_codes)
# Positional split: column 1 is the `...1` row index, the last two columns
# are the ELA and math targets; everything in between is the feature matrix.
test_x = test[, -c(1, length(df)-1, length(df))]
test_ela = test[, length(df)-1]
test_math = test[, length(df)]

# create train and validation set 
# NOTE(review): sample_frac() draws randomly and no set.seed() is called, so
# the 85/15 split — and every metric below — changes on each run.
df <- df |> 
  dplyr::filter(!rownames(df) %in% ravenswood_school_codes) 
train <- df |> 
  dplyr::sample_frac(0.85)
# validation set = rows not drawn into train, matched on the `...1` index
val  <- dplyr::anti_join(df, train, by ='...1')

# Positional split, mirroring the test set: col 1 is the `...1` index, the
# last two columns are the ELA and math targets.
train_x = train[, -c(1, length(df)-1, length(df))]
train_ela = train[, length(df)-1]
train_math = train[, length(df)]

val_x = val[, -c(1, length(df)-1, length(df))]
val_ela = val[, length(df)-1]
val_math = val[, length(df)]

### ELA

# train the KNN model 
# knnreg() (caret) fits a k-nearest-neighbour regression; per the str()
# output below it uses its default k = 5.
knnmodel = knnreg(train_x, train_ela)
str(knnmodel)
## List of 3
##  $ learn  :List of 2
##   ..$ y: num [1:6190] -38.52 -114.35 -148.27 2.27 12.07 ...
##   ..$ X:'data.frame':    6190 obs. of  14 variables:
##   .. ..$ All.Students                            : num [1:6190, 1] -0.7015 0.2974 0.0164 -0.6521 0.0164 ...
##   .. .. ..- attr(*, "scaled:center")= num 343
##   .. .. ..- attr(*, "scaled:scale")= num 263
##   .. ..$ Perc.Female                             : num [1:6190] 48.7 48.5 43.8 46.8 46.7 ...
##   .. ..$ Perc.Economically.disadvantaged         : num [1:6190] 62 73.4 89.6 10.5 90.5 ...
##   .. ..$ Perc.Black.or.African.American          : num [1:6190] 0.633 9.264 53.602 3.509 4.323 ...
##   .. ..$ Perc.American.Indian.or.Alaska.Native   : num [1:6190] 0.633 0.238 0 0 0.288 ...
##   .. ..$ Perc.Asian                              : num [1:6190] 37.975 0.238 0.288 5.848 0.288 ...
##   .. ..$ Perc.Hispanic.or.Latino                 : num [1:6190] 52.5 71 41.5 28.7 88.2 ...
##   .. ..$ Perc.White                              : num [1:6190] 0.633 15.914 1.153 49.123 4.611 ...
##   .. ..$ Perc.Not.a.high.school.graduate         : num [1:6190] 22.152 5.938 12.68 0.585 11.527 ...
##   .. ..$ Perc.Reported.disabilities              : num [1:6190] 10.76 8.55 15.56 9.94 7.78 ...
##   .. ..$ Perc.Two.or.more.races                  : num [1:6190] 5.7 2.14 1.44 11.7 2.02 ...
##   .. ..$ Perc.EL..English.learner.               : num [1:6190] 39.241 5.938 5.476 0.585 16.138 ...
##   .. ..$ Perc.Filipino                           : num [1:6190] 0.633 0.238 0.288 0.585 0 ...
##   .. ..$ Perc.Native.Hawaiian.or.Pacific.Islander: num [1:6190] 0 0.238 0.288 0.585 0.288 ...
##  $ k      : num 5
##  $ theDots: list()
##  - attr(*, "class")= chr "knnreg"
# predict on the validation set and check accuracy
pred_y = predict(knnmodel, data.frame(val_x))

# error metrics in the same units as the (possibly scaled) target column
mse = mean((val_ela - pred_y)^2)
mae = caret::MAE(val_ela, pred_y)
rmse = caret::RMSE(val_ela, pred_y)

cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE:  1893.582 MAE:  34.23814  RMSE:  43.51531
# predict on ravenswood schools
pred_y = predict(knnmodel, data.frame(test_x))
print(data.frame(test_ela, pred_y, test_ela-pred_y))
##     test_ela     pred_y test_ela...pred_y
## 1 -129.46667  -80.29333         -49.17333
## 2 -115.01667 -135.62167          20.60500
## 3 -110.56667  -73.16333         -37.40333
## 4 -195.23333 -109.72167         -85.51167
## 5  -83.86667  -75.50667          -8.36000
## 6 -102.90000  -90.24667         -12.65333
mse = mean((test_ela - pred_y)^2)
mae = caret::MAE(test_ela, pred_y)
rmse = caret::RMSE(test_ela, pred_y)

cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE:  1963.972 MAE:  35.61778  RMSE:  44.31673
### Math

# train the KNN model 
# Same procedure as for ELA, with the math distance as the target.
knnmodel_math = knnreg(train_x, train_math)
str(knnmodel_math)
## List of 3
##  $ learn  :List of 2
##   ..$ y: num [1:6190] -32.5 -103.3 -107 22.6 -14.2 ...
##   ..$ X:'data.frame':    6190 obs. of  14 variables:
##   .. ..$ All.Students                            : num [1:6190, 1] -0.7015 0.2974 0.0164 -0.6521 0.0164 ...
##   .. .. ..- attr(*, "scaled:center")= num 343
##   .. .. ..- attr(*, "scaled:scale")= num 263
##   .. ..$ Perc.Female                             : num [1:6190] 48.7 48.5 43.8 46.8 46.7 ...
##   .. ..$ Perc.Economically.disadvantaged         : num [1:6190] 62 73.4 89.6 10.5 90.5 ...
##   .. ..$ Perc.Black.or.African.American          : num [1:6190] 0.633 9.264 53.602 3.509 4.323 ...
##   .. ..$ Perc.American.Indian.or.Alaska.Native   : num [1:6190] 0.633 0.238 0 0 0.288 ...
##   .. ..$ Perc.Asian                              : num [1:6190] 37.975 0.238 0.288 5.848 0.288 ...
##   .. ..$ Perc.Hispanic.or.Latino                 : num [1:6190] 52.5 71 41.5 28.7 88.2 ...
##   .. ..$ Perc.White                              : num [1:6190] 0.633 15.914 1.153 49.123 4.611 ...
##   .. ..$ Perc.Not.a.high.school.graduate         : num [1:6190] 22.152 5.938 12.68 0.585 11.527 ...
##   .. ..$ Perc.Reported.disabilities              : num [1:6190] 10.76 8.55 15.56 9.94 7.78 ...
##   .. ..$ Perc.Two.or.more.races                  : num [1:6190] 5.7 2.14 1.44 11.7 2.02 ...
##   .. ..$ Perc.EL..English.learner.               : num [1:6190] 39.241 5.938 5.476 0.585 16.138 ...
##   .. ..$ Perc.Filipino                           : num [1:6190] 0.633 0.238 0.288 0.585 0 ...
##   .. ..$ Perc.Native.Hawaiian.or.Pacific.Islander: num [1:6190] 0 0.238 0.288 0.585 0.288 ...
##  $ k      : num 5
##  $ theDots: list()
##  - attr(*, "class")= chr "knnreg"
# predict on the validation set and check accuracy
pred_y = predict(knnmodel_math, data.frame(val_x))

mse = mean((val_math - pred_y)^2)
mae = caret::MAE(val_math, pred_y)
rmse = caret::RMSE(val_math, pred_y)

cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE:  804.1747 MAE:  21.92441  RMSE:  28.35797
# predict on ravenswood schools
pred_y = predict(knnmodel_math, data.frame(test_x))
print(data.frame(test_math, pred_y, test_math-pred_y))
##    test_math     pred_y test_math...pred_y
## 1 -117.30000  -88.16333        -29.1366667
## 2 -106.10000 -106.73333          0.6333333
## 3  -94.73333  -75.01000        -19.7233333
## 4 -148.20000 -100.82333        -47.3766667
## 5  -94.26667  -74.11167        -20.1550000
## 6 -126.43333  -92.69000        -33.7433333
mse = mean((test_math - pred_y)^2)
mae = caret::MAE(test_math, pred_y)
rmse = caret::RMSE(test_math, pred_y)

cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE:  837.9569 MAE:  25.12806  RMSE:  28.94749

OLS & LASSO

One limitation of KNN is its lack of explainability. To complement it, we use other methods that aid interpretation.

First we conduct regular multivariate linear regression.

# multivariate OLS
# ELA distance regressed on all demographic predictors: the data drops
# column 1 (the `...1` row index) and the last column (the math target) so
# Avg.Score.Dist is the only outcome in the formula's `.`.
model_ela <- lm(Avg.Score.Dist ~ ., data = df[, -c(1, length(df))])
summary(model_ela)
## 
## Call:
## lm(formula = Avg.Score.Dist ~ ., data = df[, -c(1, length(df))])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -201.26  -23.82    5.02   28.11  414.25 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              -263.92540   56.34968  -4.684 2.87e-06
## All.Students                              -19.37083    0.52431 -36.946  < 2e-16
## Perc.Female                                 0.51214    0.12102   4.232 2.35e-05
## Perc.Economically.disadvantaged            -1.00550    0.03542 -28.389  < 2e-16
## Perc.Black.or.African.American              2.13116    0.56541   3.769 0.000165
## Perc.American.Indian.or.Alaska.Native       1.05047    0.59550   1.764 0.077771
## Perc.Asian                                  3.55089    0.56313   6.306 3.04e-10
## Perc.Hispanic.or.Latino                     2.82995    0.56386   5.019 5.32e-07
## Perc.White                                  2.56865    0.56455   4.550 5.45e-06
## Perc.Not.a.high.school.graduate            -0.85200    0.06478 -13.153  < 2e-16
## Perc.Reported.disabilities                 -1.22607    0.08819 -13.902  < 2e-16
## Perc.Two.or.more.races                      2.93370    0.56451   5.197 2.08e-07
## Perc.EL..English.learner.                  -0.07578    0.04854  -1.561 0.118486
## Perc.Filipino                               2.79962    0.57601   4.860 1.20e-06
## Perc.Native.Hawaiian.or.Pacific.Islander   -2.90631    0.86670  -3.353 0.000803
##                                             
## (Intercept)                              ***
## All.Students                             ***
## Perc.Female                              ***
## Perc.Economically.disadvantaged          ***
## Perc.Black.or.African.American           ***
## Perc.American.Indian.or.Alaska.Native    .  
## Perc.Asian                               ***
## Perc.Hispanic.or.Latino                  ***
## Perc.White                               ***
## Perc.Not.a.high.school.graduate          ***
## Perc.Reported.disabilities               ***
## Perc.Two.or.more.races                   ***
## Perc.EL..English.learner.                   
## Perc.Filipino                            ***
## Perc.Native.Hawaiian.or.Pacific.Islander ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.81 on 7267 degrees of freedom
## Multiple R-squared:  0.5667, Adjusted R-squared:  0.5658 
## F-statistic: 678.8 on 14 and 7267 DF,  p-value: < 2.2e-16
# Same OLS specification for the math outcome.  df[, -c(1, length(df)-1)]
# drops column 1 (School.Code) and the second-to-last column — presumably the
# ELA outcome Avg.Score.Dist — leaving Avg.Score.Dist.Math as the response.
model_math <- lm(Avg.Score.Dist.Math ~ ., data = df[, -c(1, length(df)-1)])
summary(model_math)
## 
## Call:
## lm(formula = Avg.Score.Dist.Math ~ ., data = df[, -c(1, length(df) - 
##     1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.357  -17.525    1.524   19.190  138.075 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              -115.36125   40.31582  -2.861 0.004229
## All.Students                               -6.67623    0.37512 -17.798  < 2e-16
## Perc.Female                                 0.05550    0.08659   0.641 0.521553
## Perc.Economically.disadvantaged            -1.00025    0.02534 -39.472  < 2e-16
## Perc.Black.or.African.American              0.82341    0.40453   2.035 0.041839
## Perc.American.Indian.or.Alaska.Native       0.30895    0.42605   0.725 0.468387
## Perc.Asian                                  2.52593    0.40289   6.269 3.83e-10
## Perc.Hispanic.or.Latino                     1.53281    0.40342   3.800 0.000146
## Perc.White                                  1.50118    0.40391   3.717 0.000203
## Perc.Not.a.high.school.graduate            -0.33168    0.04635  -7.157 9.08e-13
## Perc.Reported.disabilities                 -1.17273    0.06310 -18.586  < 2e-16
## Perc.Two.or.more.races                      1.50681    0.40389   3.731 0.000192
## Perc.EL..English.learner.                  -0.32867    0.03473  -9.465  < 2e-16
## Perc.Filipino                               1.39945    0.41211   3.396 0.000688
## Perc.Native.Hawaiian.or.Pacific.Islander   -3.22526    0.62009  -5.201 2.03e-07
##                                             
## (Intercept)                              ** 
## All.Students                             ***
## Perc.Female                                 
## Perc.Economically.disadvantaged          ***
## Perc.Black.or.African.American           *  
## Perc.American.Indian.or.Alaska.Native       
## Perc.Asian                               ***
## Perc.Hispanic.or.Latino                  ***
## Perc.White                               ***
## Perc.Not.a.high.school.graduate          ***
## Perc.Reported.disabilities               ***
## Perc.Two.or.more.races                   ***
## Perc.EL..English.learner.                ***
## Perc.Filipino                            ***
## Perc.Native.Hawaiian.or.Pacific.Islander ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.2 on 7267 degrees of freedom
## Multiple R-squared:  0.7127, Adjusted R-squared:  0.7122 
## F-statistic:  1288 on 14 and 7267 DF,  p-value: < 2.2e-16

Then we use a LASSO regression to conduct feature selection.

# Cross-validated LASSO (alpha = 1) for the ELA outcome.
# x: all predictors — drop School.Code (col 1) and both outcome columns
#    (the last two, assumed to be ELA and math score distances).
# y: the ELA outcome, assumed to be the second-to-last column of df.
mod_cv <- cv.glmnet(x=as.matrix(df[, -c(1, length(df)-1, length(df))]), 
                    y=df[, length(df)-1], 
                    family='gaussian',
                    alpha=1)

# cvm : The mean cross-validated error - a vector of length length(lambda)
# lambda.min : the λ at which the minimal MSE is achieved.
# lambda.1se : the largest λ at which the MSE is within one standard error of the minimal MSE.
   
plot(log(mod_cv$lambda), mod_cv$cvm)

plot(mod_cv) 

# Coefficients at both lambda.min and lambda.1se; a `.` means the predictor
# was shrunk to exactly zero (dropped) at that penalty.
coef(mod_cv, c(mod_cv$lambda.min,
               mod_cv$lambda.1se))
## 15 x 2 sparse Matrix of class "dgCMatrix"
##                                                    s1           s2
## (Intercept)                               -8.05632673  17.50563149
## All.Students                             -18.56218882 -15.60460163
## Perc.Female                                0.50345894   0.09648271
## Perc.Economically.disadvantaged           -1.00231698  -0.93519601
## Perc.Black.or.African.American            -0.42992351  -0.36186677
## Perc.American.Indian.or.Alaska.Native     -1.53262561  -0.97287735
## Perc.Asian                                 0.98125145   0.77302585
## Perc.Hispanic.or.Latino                    0.24701532   .         
## Perc.White                                 .            .         
## Perc.Not.a.high.school.graduate           -0.84308762  -0.59572932
## Perc.Reported.disabilities                -1.23666135  -1.02836423
## Perc.Two.or.more.races                     0.39151311   .         
## Perc.EL..English.learner.                 -0.05927215   .         
## Perc.Filipino                              0.24434064   .         
## Perc.Native.Hawaiian.or.Pacific.Islander  -5.49456756  -3.12186393
print(paste(mod_cv$lambda.min,
            log(mod_cv$lambda.min)))
## [1] "0.058570690798936 -2.83752086453416"
print(paste(mod_cv$lambda.1se,
            log(mod_cv$lambda.1se)))
## [1] "2.20514891017484 0.790795039577674"
# Refit the ELA LASSO once, at the more parsimonious one-standard-error
# penalty chosen by cross-validation above.
best_model_ela <- glmnet(
  x = as.matrix(df[, -c(1, length(df) - 1, length(df))]),
  y = df[, length(df) - 1],
  alpha = 1,
  family = 'gaussian',
  lambda = mod_cv$lambda.1se
)
# Cross-validated LASSO for the math outcome (y: last column of df);
# same predictor matrix as the ELA fit above.
# NOTE(review): this reuses the name `mod_cv`, overwriting the ELA fit.
mod_cv <- cv.glmnet(x=as.matrix(df[, -c(1, length(df)-1, length(df))]), 
                    y=df[, length(df)], 
                    family='gaussian',
                    alpha=1)

# cvm : The mean cross-validated error - a vector of length length(lambda)
# lambda.min : the λ at which the minimal MSE is achieved.
# lambda.1se : the largest λ at which the MSE is within one standard error of the minimal MSE.
   
plot(log(mod_cv$lambda), mod_cv$cvm)

plot(mod_cv) 

# Coefficients at lambda.min / lambda.1se; `.` = dropped by the penalty.
coef(mod_cv, c(mod_cv$lambda.min,
               mod_cv$lambda.1se))
## 15 x 2 sparse Matrix of class "dgCMatrix"
##                                                   s1         s2
## (Intercept)                              35.62117768 33.3423235
## All.Students                             -5.98740678 -4.1821726
## Perc.Female                               0.02600928  .        
## Perc.Economically.disadvantaged          -0.99118677 -1.0320229
## Perc.Black.or.African.American           -0.67091445 -0.5177721
## Perc.American.Indian.or.Alaska.Native    -1.16385580 -0.5996723
## Perc.Asian                                1.00219682  0.9271132
## Perc.Hispanic.or.Latino                   .           .        
## Perc.White                                .           .        
## Perc.Not.a.high.school.graduate          -0.31245996 -0.2680361
## Perc.Reported.disabilities               -1.16475994 -0.9811858
## Perc.Two.or.more.races                    .           .        
## Perc.EL..English.learner.                -0.30127901 -0.1898708
## Perc.Filipino                            -0.02145322  .        
## Perc.Native.Hawaiian.or.Pacific.Islander -4.69212048 -3.4073931
print(paste(mod_cv$lambda.min,
            log(mod_cv$lambda.min)))
## [1] "0.174186992469576 -1.74762588744596"
print(paste(mod_cv$lambda.1se,
            log(mod_cv$lambda.1se)))
## [1] "1.48015995341616 0.392150158568704"
# Predictors shrunk to zero by the LASSO at lambda.1se (in one or both of the
# ELA and math fits above) — drop them from the working data set.
drop_by_LASSO <- c('Perc.Hispanic.or.Latino', 'Perc.White', 'Perc.Two.or.more.races', 'Perc.Filipino')
caaspp22_final_drop <- caaspp22_final |>
  # all_of() makes the external-vector selection explicit; passing the bare
  # vector is deprecated since tidyselect 1.1.0 and raised a warning here.
  select(-all_of(drop_by_LASSO))

K-means Clustering

# scale. = FALSE/TRUE will return different proportion of variance explained
# but the contributions of each variable look alike
# PCA with neither centering nor scaling.  NOTE(review): uncentered PCA is
# unusual; the component SDs near 1 in the summary suggest the input was
# already standardized upstream (see the commented-out scale() step below) —
# TODO confirm.
res.pca <- prcomp(caaspp22_final_drop, center = FALSE, scale. = FALSE)
summary(res.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.6128 1.1855 1.0937 1.0501 0.97461 0.96213 0.85356
## Proportion of Variance 0.2601 0.1406 0.1196 0.1103 0.09499 0.09257 0.07286
## Cumulative Proportion  0.2601 0.4007 0.5203 0.6305 0.72552 0.81809 0.89095
##                            PC8     PC9    PC10
## Standard deviation     0.77259 0.50142 0.49212
## Proportion of Variance 0.05969 0.02514 0.02422
## Cumulative Proportion  0.95064 0.97578 1.00000
# Summarize and visualize a prcomp fit: print per-component variance shares,
# draw a 2x2 panel of variance/scree plots, then heatmaps of the loadings.
# @param x a `prcomp` object (needs $sdev and $rotation).
pcaCharts <- function(x) {
    x.var <- x$sdev ^ 2
    x.pvar <- x.var/sum(x.var)
    print("proportions of variance:")
    print(x.pvar)
    
    par(mfrow=c(2,2))
    plot(x.pvar,xlab="Principal component", ylab="Proportion of variance explained", ylim=c(0,1), type='b')
    plot(cumsum(x.pvar),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')
    screeplot(x)
    screeplot(x,type="l")
    par(mfrow=c(1,1))
    # BUG FIX: these two calls previously referenced the global `res.pca`
    # instead of the argument `x`, so the loading plots silently ignored
    # whatever object was passed in.
    corrplot(x$rotation, is.corr=TRUE)
    corrplot(x$rotation, is.corr=FALSE)
}

pcaCharts(res.pca)
## [1] "proportions of variance:"
##  [1] 0.26010467 0.14054946 0.11961206 0.11027285 0.09498573 0.09256889
##  [7] 0.07285719 0.05968890 0.02514185 0.02421841

# Variable map: each arrow is a variable's loading on PC1/PC2.
fviz_pca_var(
  res.pca,
  col.var = "contrib",                                # color by contribution to the PCs
  gradient.cols = c("#EDE9D5", "#FFE15D", "#FC4E07"), # low -> high contribution
  repel = TRUE                                        # avoid overlapping labels
)

# First two principal components, sign-flipped for plotting orientation.
# NOTE(review): caaspp22_pca is reassigned from a CSV a few lines below;
# verify which version the downstream code is meant to use.
caaspp22_pca <- as.data.frame(-res.pca$x[, 1:2])

# Peer schools: neighbours of the Ravenswood center school (code 6044309),
# excluding the center itself (Rank 0).
peer_school_codes <- top10_ravenswood |>
  filter(Rank != 0, Center == 6044309) |>
  pull(School.Code)

#cols <- c("snow3", "#55AD89", "#EF6F6A")
cols <- c("snow3", "darkgoldenrod2", "#EF6F6A")

caaspp22_pca <- read_csv('data/caaspp22_ela_demo.csv') |>
    # drop the stray row-number column written by the CSV export
    select(-'...1') |>
    # BUG FIX: was filter('School.Code' != '1996115'), which compares two
    # string LITERALS (always TRUE) and filtered nothing — and School.Code
    # was no longer a column after column_to_rownames().  Filter on the real
    # (numeric, per the column spec below) column before moving it to rownames.
    filter(School.Code != 1996115) |>
    # drop the column to set School.Code as index
    column_to_rownames(var = 'School.Code') #|>
## New names:
## Rows: 7291 Columns: 16
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (16): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
    #mutate(across(where(is.numeric), scale))

# Label each school for plotting: Ravenswood, one of its peers, or other.
# (%in% never returns NA, so case_when matches the nested-ifelse behavior.)
caaspp22_pca_biplot <- caaspp22_pca |>
  rownames_to_column('School.Code') |>
  mutate(School.Group = case_when(
    School.Code %in% ravenswood_school_codes ~ "Ravenswood",
    School.Code %in% peer_school_codes ~ "Peer School",
    TRUE ~ "Other"
  ))

# Scatter of economic disadvantage vs. parent non-HS-graduation, highlighting
# Ravenswood and its peer schools.  NOTE(review): despite the "PCA" title,
# the axes are original variables, not principal components — confirm intent.
ggplot(caaspp22_pca_biplot, 
       aes(x = Perc.Economically.disadvantaged, 
           y = Perc.Not.a.high.school.graduate,
           color = School.Group
          )) +
  geom_point(#color="cornflowerblue", 
             #size = 1, 
             aes(alpha = School.Group,
                 size= School.Group)) +
  # NOTE(review): label is '' for every row, so this layer draws nothing;
  # looks like a leftover placeholder for labelling school 6044309.
  geom_text(aes(label=ifelse(School.Code==6044309,'','')),
            hjust=.2,
            vjust=.2) + 
  scale_color_manual(values = cols) + 
  # discrete size/alpha scales produce the warnings shown below
  scale_size_discrete(range = c(0.8, 2)) + 
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Economically.disadvantaged",
       y = "Perc.Not.a.high.school.graduate",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

# Same highlight plot for percent-female vs. school size (All.Students).
ggplot(caaspp22_pca_biplot, 
       aes(x = Perc.Female, 
           y = All.Students,
           color = School.Group
          )) +
  geom_point(#color="cornflowerblue", 
             #size = 1, 
             aes(alpha = School.Group,
                 size= School.Group)) +
  # NOTE(review): label is always '' — dead layer (see first plot).
  geom_text(aes(label=ifelse(School.Code==6044309,'','')),
            hjust=.2,
            vjust=.2) + 
  scale_color_manual(values = cols) + 
  scale_size_discrete(range = c(0.8, 2)) + 
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Female",
       y = "All.Students",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Using alpha for a discrete variable is not advised.

# Same highlight plot for reported disabilities vs. English learners.
ggplot(caaspp22_pca_biplot, 
       aes(x = Perc.Reported.disabilities, 
           y = Perc.EL..English.learner.,
           color = School.Group
          )) +
  geom_point(#color="cornflowerblue", 
             #size = 1, 
             aes(alpha = School.Group,
                 size= School.Group)) +
  # NOTE(review): label is always '' — dead layer (see first plot).
  geom_text(aes(label=ifelse(School.Code==6044309,'','')),
            hjust=.2,
            vjust=.2) + 
  scale_color_manual(values = cols) + 
  scale_size_discrete(range = c(0.8, 2)) + 
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Reported.disabilities",
       y = "Perc.EL..English.learner.",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Using alpha for a discrete variable is not advised.

# Choose k for k-means from the within-cluster sum of squares (elbow) plot.
# NOTE(review): kmeans is stochastic — a set.seed() call before this chunk
# would make the elbow plot and the fits below reproducible.
fviz_nbclust(caaspp22_pca, 
             kmeans, 
             method='wss' # the elbow shows up when k = 4
             #method='silhouette' # similarly suggest k = 3
             )
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 364550)

# Fit k-means with both candidate k values on the PCA scores.
res.km3 <- kmeans(caaspp22_pca, 3)
res.km4 <- kmeans(caaspp22_pca, 4)
#concern: not exactly same size
#print(res.km4)
# NOTE(review): clusters were fit on caaspp22_pca but plotted over
# caaspp22_final — confirm the two data frames share the same rows/order.
fviz_cluster(res.km4, data=caaspp22_final, 
             geom="point")

#It’s possible to compute the mean of each variables by clusters using the original data:
#aggregate(USArrests, by=list(cluster=km.res$cluster), mean)
# Re-cluster with eclust (25 random starts) on caaspp22_final so that
# silhouette diagnostics are available.  NOTE(review): this clusters the
# full feature set, not the PCA scores used above — confirm that is intended.
res.km3.eclust <- eclust(caaspp22_final, "kmeans", k = 3,
                 nstart = 25, graph = FALSE)
res.km4.eclust <- eclust(caaspp22_final, "kmeans", k = 4,
                 nstart = 25, graph = FALSE)

fviz_cluster(res.km3.eclust,
            geom = "point",
            ellipse.type = "norm",
            ellipse.level = 0.95,
            ellipse.alpha = 0.2,
            pointsize = 0.8,
            ggtheme=theme_bw())

fviz_cluster(res.km4.eclust,
            geom = "point",
            ellipse.type = "norm",
            ellipse.level = 0.95,
            ellipse.alpha = 0.2,
            pointsize = 0.8,
            ggtheme=theme_bw())

# Average silhouette width per cluster; k = 3 avoids the negative-width
# cluster that appears at k = 4.
fviz_silhouette(res.km3.eclust)
##   cluster size ave.sil.width
## 1       1 1042          0.05
## 2       2 3734          0.30
## 3       3 2515          0.26

fviz_silhouette(res.km4.eclust)
##   cluster size ave.sil.width
## 1       1  748         -0.04
## 2       2  861          0.11
## 3       3 3177          0.37
## 4       4 2505          0.24